Setup

Inlcuding Continents in Dataset:

data <- read.csv("group_3_data.csv")
continents <- read.csv("Countries by continents.csv")
continents <- continents %>% rename(reference_area = Country)
data <- merge(data, continents, by = "reference_area")

General Stats

Statistics by continent (mean, median, sd, max, min):

datacont <- data[,-1] %>% group_by(Continent)
variables <- c("education", "employment", "absence_of_legal_discrimination", "access_to_productive_and_financial_assets", "reproductive_discrim", "restricted_civil_liberties", "discrimination_in_the_family")
statstable_bycont <- datacont %>% group_by(Continent) %>% 
  summarise(across(all_of(variables), list(mean = mean,
                                           median = median,
                                           sd = sd,
                                           max = max,
                                           min = min)))
statstable_bycont <- data.frame(statstable_bycont)
statstable_bycont
##       Continent education_mean education_median education_sd education_max
## 1        Africa       6.219914         6.144916   2.28506222       9.65739
## 2          Asia       9.200349         9.232720   2.60907629      13.30556
## 3        Europe      12.140683        12.224860   1.13651010      13.84053
## 4 North America       9.383249         9.096670   3.86157270      13.91498
## 5       Oceania      12.873235        12.873235   0.08859335      12.93588
## 6 South America       9.517324         9.234145   1.03437457      11.35365
##   education_min employment_mean employment_median employment_sd employment_max
## 1      2.161845        55.00311          58.90759     19.512818       81.04615
## 2      3.896490        49.56838          54.04757     20.638198       76.96047
## 3      9.778190        69.68894          71.93355      9.169670       79.98667
## 4      4.552000        58.49051          57.45274     10.928704       75.06139
## 5     12.810590        76.73362          76.73362      2.103119       78.22075
## 6      8.333614        63.29520          62.82961      8.258768       75.80638
##   employment_min absence_of_legal_discrimination_mean
## 1       25.60620                             77.50000
## 2       14.21400                             70.91346
## 3       46.26808                             93.48558
## 4       43.04505                             82.32143
## 5       75.24649                             97.18750
## 6       53.31744                             86.32812
##   absence_of_legal_discrimination_median absence_of_legal_discrimination_sd
## 1                                76.8750                          8.3560887
## 2                                75.6250                         13.3258818
## 3                                93.7500                          5.8018668
## 4                                86.2500                         13.0389755
## 5                                97.1875                          0.4419417
## 6                                86.8750                          5.2124240
##   absence_of_legal_discrimination_max absence_of_legal_discrimination_min
## 1                              88.125                              64.375
## 2                              90.625                              46.875
## 3                             100.000                              78.750
## 4                             100.000                              61.250
## 5                              97.500                              96.875
## 6                              95.000                              79.375
##   access_to_productive_and_financial_assets_mean
## 1                                       30.10000
## 2                                       33.22308
## 3                                       11.64231
## 4                                       20.94286
## 5                                       18.85000
## 6                                       26.52500
##   access_to_productive_and_financial_assets_median
## 1                                            35.10
## 2                                            38.80
## 3                                             7.00
## 4                                            18.60
## 5                                            18.85
## 6                                            20.65
##   access_to_productive_and_financial_assets_sd
## 1                                    10.734990
## 2                                    13.638191
## 3                                     9.945539
## 4                                     7.351611
## 5                                     2.050610
## 6                                    16.262380
##   access_to_productive_and_financial_assets_max
## 1                                          42.6
## 2                                          50.4
## 3                                          39.6
## 4                                          34.0
## 5                                          20.3
## 6                                          60.6
##   access_to_productive_and_financial_assets_min reproductive_discrim_mean
## 1                                          12.0                 47.222222
## 2                                           3.5                 53.846154
## 3                                           0.8                  1.923077
## 4                                          15.1                 75.000000
## 5                                          17.4                  0.000000
## 6                                           8.6                 43.750000
##   reproductive_discrim_median reproductive_discrim_sd reproductive_discrim_max
## 1                        50.0               29.166667                       75
## 2                        75.0               37.977726                      100
## 3                         0.0                9.805807                       50
## 4                        75.0               35.355339                      100
## 5                         0.0                0.000000                        0
## 6                        62.5               37.201190                       75
##   reproductive_discrim_min restricted_civil_liberties_mean
## 1                        0                        26.43333
## 2                        0                        35.96923
## 3                        0                        17.31538
## 4                        0                        24.94286
## 5                        0                        19.70000
## 6                        0                        10.40000
##   restricted_civil_liberties_median restricted_civil_liberties_sd
## 1                             25.50                     19.329964
## 2                             36.50                     16.529891
## 3                             17.75                      5.656620
## 4                             20.50                     17.755267
## 5                             19.70                      3.959798
## 6                              8.05                      7.012438
##   restricted_civil_liberties_max restricted_civil_liberties_min
## 1                           52.3                            2.1
## 2                           55.4                           11.5
## 3                           27.5                            3.1
## 4                           61.7                            5.9
## 5                           22.5                           16.9
## 6                           26.9                            4.5
##   discrimination_in_the_family_mean discrimination_in_the_family_median
## 1                          49.78889                                44.3
## 2                          56.70000                                56.9
## 3                          14.90769                                 8.3
## 4                          26.91429                                29.2
## 5                           7.90000                                 7.9
## 6                          21.72500                                21.7
##   discrimination_in_the_family_sd discrimination_in_the_family_max
## 1                        20.16541                             80.2
## 2                        26.91694                             89.1
## 3                        13.04178                             42.3
## 4                        15.02114                             49.8
## 5                         0.00000                              7.9
## 6                        10.03106                             36.7
##   discrimination_in_the_family_min
## 1                             18.8
## 2                             14.6
## 3                              0.0
## 4                             10.6
## 5                              7.9
## 6                              8.5

Statistics overall (mean, median, sd, max, min):

statstable <- data.frame(Mean = sapply(data[variables], mean),
                         Median = sapply(data[variables], median),
                         `Standard Deviation` = sapply(data[variables], sd),
                         Maximum = sapply(data[variables], max),
                         Minimum = sapply(data[variables], min))
statstable
##                                               Mean   Median Standard.Deviation
## education                                 10.13553 10.88014           2.873973
## employment                                61.85526 64.48421          15.800035
## absence_of_legal_discrimination           84.78846 87.50000          12.536216
## access_to_productive_and_financial_assets 21.56923 20.30000          14.253935
## reproductive_discrim                      31.53846  0.00000          37.831071
## restricted_civil_liberties                22.35231 18.80000          14.653958
## discrimination_in_the_family              30.01231 21.90000          24.546903
##                                             Maximum   Minimum
## education                                  13.91498  2.161845
## employment                                 81.04615 14.213997
## absence_of_legal_discrimination           100.00000 46.875000
## access_to_productive_and_financial_assets  60.60000  0.800000
## reproductive_discrim                      100.00000  0.000000
## restricted_civil_liberties                 61.70000  2.100000
## discrimination_in_the_family               89.10000  0.000000

Means of just education and employment by continent (Q1+3):

minitable <- datacont %>% group_by(Continent) %>% summarize(`Mean Education` = mean(education),`Mean Employment` = mean(employment))
minitable
## # A tibble: 6 × 3
##   Continent     `Mean Education` `Mean Employment`
##   <chr>                    <dbl>             <dbl>
## 1 Africa                    6.22              55.0
## 2 Asia                      9.20              49.6
## 3 Europe                   12.1               69.7
## 4 North America             9.38              58.5
## 5 Oceania                  12.9               76.7
## 6 South America             9.52              63.3

Means of just education and access to financials by continent (Q2):

minitable2 <- datacont %>% group_by(Continent) %>% summarize(`Mean Education` = mean(education),`Mean Access` = mean(access_to_productive_and_financial_assets))
minitable2
## # A tibble: 6 × 3
##   Continent     `Mean Education` `Mean Access`
##   <chr>                    <dbl>         <dbl>
## 1 Africa                    6.22          30.1
## 2 Asia                      9.20          33.2
## 3 Europe                   12.1           11.6
## 4 North America             9.38          20.9
## 5 Oceania                  12.9           18.8
## 6 South America             9.52          26.5

Correlation Matrix:

cormatrix <- as.data.frame(cor(data[,variables]))
cormatrix
##                                            education employment
## education                                  1.0000000  0.5649636
## employment                                 0.5649636  1.0000000
## absence_of_legal_discrimination            0.6404098  0.7090588
## access_to_productive_and_financial_assets -0.5384356 -0.5388207
## reproductive_discrim                      -0.5460163 -0.3990825
## restricted_civil_liberties                -0.3184921 -0.3202537
## discrimination_in_the_family              -0.6249280 -0.7398914
##                                           absence_of_legal_discrimination
## education                                                       0.6404098
## employment                                                      0.7090588
## absence_of_legal_discrimination                                 1.0000000
## access_to_productive_and_financial_assets                      -0.7074188
## reproductive_discrim                                           -0.5808030
## restricted_civil_liberties                                     -0.6622231
## discrimination_in_the_family                                   -0.8086819
##                                           access_to_productive_and_financial_assets
## education                                                                -0.5384356
## employment                                                               -0.5388207
## absence_of_legal_discrimination                                          -0.7074188
## access_to_productive_and_financial_assets                                 1.0000000
## reproductive_discrim                                                      0.4734101
## restricted_civil_liberties                                                0.4340695
## discrimination_in_the_family                                              0.5854146
##                                           reproductive_discrim
## education                                           -0.5460163
## employment                                          -0.3990825
## absence_of_legal_discrimination                     -0.5808030
## access_to_productive_and_financial_assets            0.4734101
## reproductive_discrim                                 1.0000000
## restricted_civil_liberties                           0.3043340
## discrimination_in_the_family                         0.4776794
##                                           restricted_civil_liberties
## education                                                 -0.3184921
## employment                                                -0.3202537
## absence_of_legal_discrimination                           -0.6622231
## access_to_productive_and_financial_assets                  0.4340695
## reproductive_discrim                                       0.3043340
## restricted_civil_liberties                                 1.0000000
## discrimination_in_the_family                               0.6327375
##                                           discrimination_in_the_family
## education                                                   -0.6249280
## employment                                                  -0.7398914
## absence_of_legal_discrimination                             -0.8086819
## access_to_productive_and_financial_assets                    0.5854146
## reproductive_discrim                                         0.4776794
## restricted_civil_liberties                                   0.6327375
## discrimination_in_the_family                                 1.0000000

VIF values:

reg_for_vif <- lm(employment ~ education + absence_of_legal_discrimination + access_to_productive_and_financial_assets + reproductive_discrim + restricted_civil_liberties + discrimination_in_the_family, data)
vif(reg_for_vif)
##                                 education 
##                                  2.081537 
##           absence_of_legal_discrimination 
##                                  4.880672 
## access_to_productive_and_financial_assets 
##                                  2.064374 
##                      reproductive_discrim 
##                                  1.656528 
##                restricted_civil_liberties 
##                                  2.008626 
##              discrimination_in_the_family 
##                                  3.318365

Graphs for Questions 1 and 3

Seperate Boxplots:

ggplot(data, aes(x = Continent, y = education)) + geom_boxplot() + labs(title = "Women's Education by Continent", y = "Education")

ggplot(data, aes(x = Continent, y = employment)) + geom_boxplot() + labs(title = "Women's Employment by Continent", y = "Employment")

Combined Boxplots:

ggplot(data) +
  geom_boxplot(aes(x = Continent, y = education, fill = "education")) +
  geom_boxplot(aes(x = Continent, y = employment, fill = "employment")) +
  labs(title = "Boxplots of Education and Employment", y = "Education and Employment", x = "Continent")

Employment Regressions

slr_employment_education <- lm(employment ~ education, data = data)

ggplot(data, aes(x = education, y = employment)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + labs(title = "Education vs Employment")
## `geom_smooth()` using formula = 'y ~ x'

e <- resid(slr_employment_education)

qqnorm(e)
qqline(e, col = "blue")

plot(fitted(slr_employment_education), e)
abline(h = 0, col = "blue")

summary(slr_employment_education)
## 
## Call:
## lm(formula = employment ~ education, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.443  -6.298   2.328   7.668  26.507 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  30.3747     6.0174   5.048 4.07e-06 ***
## education     3.1060     0.5715   5.435 9.46e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.14 on 63 degrees of freedom
## Multiple R-squared:  0.3192, Adjusted R-squared:  0.3084 
## F-statistic: 29.54 on 1 and 63 DF,  p-value: 9.456e-07

The Q-Q plot shows the residuals are normally distributed with some extremes at the tails. There is no visual pattern in the residual plot. Data points for the predictor and response variables are independent. Conditions are valid for creating a linear regression model. The fitted equation for this model is:

\[\hat{Employment} = 31.8551 + 2.9900 * Education\]

The model itself is statistically significant from the p-value of its F-test, 9.456e-07 < \(\alpha\) = 0.05; however, \(R^2\) shows that only 31.92% of the variation in employment is explained by the linear association with education in the linear regression model. This regression could be improved by adding more predictor parameters in a multiple regression.

multiple_employment <- lm(employment ~  discrimination_in_the_family + restricted_civil_liberties + absence_of_legal_discrimination, data = data)
summary(multiple_employment)
## 
## Call:
## lm(formula = employment ~ discrimination_in_the_family + restricted_civil_liberties + 
##     absence_of_legal_discrimination, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.9510  -4.4503  -0.5388   4.7802  22.2083 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     13.03642   17.14084   0.761 0.449857    
## discrimination_in_the_family    -0.37606    0.08439  -4.456 3.63e-05 ***
## restricted_civil_liberties       0.39578    0.11097   3.566 0.000711 ***
## absence_of_legal_discrimination  0.60455    0.17076   3.540 0.000772 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.508 on 61 degrees of freedom
## Multiple R-squared:  0.6548, Adjusted R-squared:  0.6379 
## F-statistic: 38.58 on 3 and 61 DF,  p-value: 4.18e-14

Where DIF = discrimination_in_the_family, RCL = restricted_civil_liberties, and absence_of_legal_discrimination = ALD:

\[\hat{Employment} = 13.03642 - 0.37606 * DIF + 0.39578 * RCL + 0.60455 * ALD\]

The multiple regression with this combination of predictor variables results in a statistically significant model with statistically significant parameters. The \(R^2_{adj}\) = 0.6379 on account for adding the new predictors. This model explains an additional 31.87% variation in employment compared to the simple linear regression. The simple linear regression only took into account education, but this model focusing on the presence / absence of liberties and discrimination appears to be a better model of prediction.

ggplot(data, aes(x = discrimination_in_the_family + restricted_civil_liberties + absence_of_legal_discrimination, y = employment)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + labs(title = "Employment multiple regression")
## `geom_smooth()` using formula = 'y ~ x'

e <- resid(multiple_employment)

qqnorm(e)
qqline(e, col = "blue")

plot(fitted(multiple_employment), e)
abline(h = 0, col = "blue")

boxcox(multiple_employment)

Education vs Employment Plot by Continent:

ggplot(data, aes(x = education, y = employment, color = Continent)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + labs(title = "Education vs Employment by Country")
## `geom_smooth()` using formula = 'y ~ x'

Graphs for Question 2

Seperate Boxplots:

ggplot(data, aes(x = Continent, y = education)) + geom_boxplot() + labs(title = "Women's Education by Continent", y = "Education")

ggplot(data, aes(x = Continent, y = access_to_productive_and_financial_assets)) + geom_boxplot() + labs(title = "Women's Access to Productive and Financial Assets by Continent", y = "Access to Productive and Financial Assets")

Combined Boxplots:

boxplot <- ggplot(data) +
  geom_boxplot(aes(x = Continent, y = education, fill = "education")) +
  geom_boxplot(aes(x = Continent, y = access_to_productive_and_financial_assets, fill = "Access to Productive and Financial Assets")) +
  labs(title = "Boxplots of Education and Access to Productive and Financial Assets", y = "Education and Access to Productive and Financial Assets", x = "Continent")
ggplotly(boxplot) %>% layout(boxmode = "group")
## Warning: 'layout' objects don't have these attributes: 'boxmode'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

Education vs Access by Continent:

ggplot(data, aes(x = education, y = access_to_productive_and_financial_assets, color = Continent)) + geom_point() + geom_smooth(method = "lm", se = FALSE) + labs(title = "Education vs Access to Productive and Financial Assets")
## `geom_smooth()` using formula = 'y ~ x'